Introduction: From two data sets called Beer and Breweries provided by Budweiser, we can find important information for Budweiser. The facts we focus on are alcohol, Beer size, Alcohol by volume of the beer (ABV), and International Bitterness Units of the beer (IBU). This data set consists of information about 2410 US craft beers from 558 US breweries. This data analysis aims to provide descriptive information about the current craft beer market in the United States.
Lets Load some libraries
library(tidyverse)
library(gridExtra)
library(class)
library(usmap)
library(ggplot2)
library(foreign)
library(haven)
library(ggplot2)
library(foreign)
library(ggplot2)
library(GGally)
library(haven)
library(magrittr)
library(data.table)
library(dplyr)
library(plyr)
library(dplyr)
library(factoextra)
library(ggplot2)
library(ggmap)
library(nycflights13)
library(tidyverse)
library(datasets)
library(readxl)
library(tidyverse)
library(magrittr)
library(DataExplorer)
library(maps)
library(plotly)
library(DT)
library(tidytext)
library(plyr)
library(gridExtra)
library(factoextra)
library(GGally)
library(readxl)
library(tidyverse)
library(magrittr)
library(DataExplorer)
library(maps)
library(plotly)
library(DT)
library(tidytext)
library(gridExtra)
library(factoextra)
library(GGally)
library(gridExtra)
library(graphics)
library(mice)
library(PerformanceAnalytics)
require(PerformanceAnalytics)
library(MASS)
library(reshape)
Lets load the data
Beers4=read.csv("/Users/owner/Desktop/homework/unit8/Beers 4.csv")
Breweries = read.csv("/Users/owner/Desktop/homework/unit8/Breweries.csv")
Lets look at the varibles
colnames(Breweries)
## [1] "Brew_ID" "Name" "City" "State"
colnames(Beers4)
## [1] "Name" "Beer_ID" "ABV" "IBU" "Brewery_id"
## [6] "Style" "Ounces"
State=as.factor(Breweries$State)
levels(State)
## [1] " AK" " AL" " AR" " AZ" " CA" " CO" " CT" " DC" " DE" " FL" " GA" " HI"
## [13] " IA" " ID" " IL" " IN" " KS" " KY" " LA" " MA" " MD" " ME" " MI" " MN"
## [25] " MO" " MS" " MT" " NC" " ND" " NE" " NH" " NJ" " NM" " NV" " NY" " OH"
## [37] " OK" " OR" " PA" " RI" " SC" " SD" " TN" " TX" " UT" " VA" " VT" " WA"
## [49] " WI" " WV" " WY"
Colorado has the highest number (47) of breweries, and California has 39 breweries, which is second. In Colorado, most brewery are in Boulder and Denver. Also, California most breweries are in San Diego.
df1 = data.frame(table(Breweries$State))
colnames(df1) = c("State","Breweries")
df1 = df1[order(-df1$Breweries),]
row.names(df1) = NULL
df1
## State Breweries
## 1 CO 47
## 2 CA 39
## 3 MI 32
## 4 OR 29
## 5 TX 28
## 6 PA 25
## 7 MA 23
## 8 WA 23
## 9 IN 22
## 10 WI 20
## 11 NC 19
## 12 IL 18
## 13 NY 16
## 14 VA 16
## 15 FL 15
## 16 OH 15
## 17 MN 12
## 18 AZ 11
## 19 VT 10
## 20 ME 9
## 21 MO 9
## 22 MT 9
## 23 CT 8
## 24 AK 7
## 25 GA 7
## 26 MD 7
## 27 OK 6
## 28 IA 5
## 29 ID 5
## 30 LA 5
## 31 NE 5
## 32 RI 5
## 33 HI 4
## 34 KY 4
## 35 NM 4
## 36 SC 4
## 37 UT 4
## 38 WY 4
## 39 AL 3
## 40 KS 3
## 41 NH 3
## 42 NJ 3
## 43 TN 3
## 44 AR 2
## 45 DE 2
## 46 MS 2
## 47 NV 2
## 48 DC 1
## 49 ND 1
## 50 SD 1
## 51 WV 1
df1$state = trimws(as.character(df1$State))
p = plot_usmap(data = df1, values = "Breweries",labels = TRUE, color = "red") +
scale_fill_continuous(low = "white", high = "red",
name = "Breweries", label = scales::comma) +
theme(legend.position = "right")
p$layers[[2]]$aes_params$size <- 2.5
p
US map also shows the highest number of breweries, and California is the second.
merged = merge(x = Beers4, y = Breweries, by.x = "Brewery_id", by.y = "Brew_ID", all.x = TRUE)
Beer = merged$Name.x
Brewery=merged$Name.y
head(merged,6)
## Brewery_id Name.x Beer_ID ABV IBU
## 1 1 Get Together 2692 0.045 50
## 2 1 Maggie's Leap 2691 0.049 26
## 3 1 Wall's End 2690 0.048 19
## 4 1 Pumpion 2689 0.060 38
## 5 1 Stronghold 2688 0.060 25
## 6 1 Parapet ESB 2687 0.056 47
## Style Ounces Name.y City
## 1 American IPA 16 NorthGate Brewing Minneapolis
## 2 Milk / Sweet Stout 16 NorthGate Brewing Minneapolis
## 3 English Brown Ale 16 NorthGate Brewing Minneapolis
## 4 Pumpkin Ale 16 NorthGate Brewing Minneapolis
## 5 American Porter 16 NorthGate Brewing Minneapolis
## 6 Extra Special / Strong Bitter (ESB) 16 NorthGate Brewing Minneapolis
## State
## 1 MN
## 2 MN
## 3 MN
## 4 MN
## 5 MN
## 6 MN
tail(merged,6)
## Brewery_id Name.x Beer_ID ABV IBU
## 2405 556 Pilsner Ukiah 98 0.055 NA
## 2406 557 Heinnieweisse Weissebier 52 0.049 NA
## 2407 557 Snapperhead IPA 51 0.068 NA
## 2408 557 Moo Thunder Stout 50 0.049 NA
## 2409 557 Porkslap Pale Ale 49 0.043 NA
## 2410 558 Urban Wilderness Pale Ale 30 0.049 NA
## Style Ounces Name.y City
## 2405 German Pilsener 12 Ukiah Brewing Company Ukiah
## 2406 Hefeweizen 12 Butternuts Beer and Ale Garrattsville
## 2407 American IPA 12 Butternuts Beer and Ale Garrattsville
## 2408 Milk / Sweet Stout 12 Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA) 12 Butternuts Beer and Ale Garrattsville
## 2410 English Pale Ale 12 Sleeping Lady Brewing Company Anchorage
## State
## 2405 CA
## 2406 NY
## 2407 NY
## 2408 NY
## 2409 NY
## 2410 AK
#visualize the missing data
sum(is.na(merged))
## [1] 1067
which(is.na(merged))
## [1] 7305 7306 7416 7423 7457 7482 7670 7671 7738 7739 7740 7741
## [13] 7798 7799 7800 7801 7802 7803 7804 7859 8019 8021 8023 8130
## [25] 8222 8223 8224 8226 8388 8528 8654 8710 8861 9117 9122 9149
## [37] 9154 9181 9326 9407 9408 9414 9457 9464 9472 9489 9490 9491
## [49] 9492 9526 9575 9581 9582 9583 9584 9585 9586 9587 9595 9600
## [61] 9624 9625 9657 9675 9676 9677 9678 9679 9680 9681 9682 9683
## [73] 9684 9685 9686 9688 9689 9691 9692 9694 9695 9696 9697 9698
## [85] 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710
## [97] 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722
## [109] 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734
## [121] 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746
## [133] 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9764
## [145] 9765 9767 9770 9775 9779 9785 9792 9798 9799 9800 9803 9804
## [157] 9805 9808 9809 9812 9815 9826 9833 9834 9838 9843 9854 9855
## [169] 9856 9858 9860 9867 9875 9877 9878 9884 9892 9893 9894 9895
## [181] 9896 9897 9899 9900 9903 9907 9909 9910 9911 9912 9913 9914
## [193] 9917 9918 9927 9931 9932 9933 9934 9936 9937 9938 9939 9940
## [205] 9941 9942 9943 9944 9945 9946 9947 9948 9949 9951 9968 9969
## [217] 9978 10000 10001 10002 10003 10006 10007 10010 10011 10012 10013 10015
## [229] 10016 10019 10020 10024 10025 10027 10030 10031 10032 10033 10035 10037
## [241] 10039 10044 10050 10051 10052 10053 10056 10057 10058 10059 10060 10062
## [253] 10063 10079 10080 10081 10083 10085 10086 10087 10088 10089 10090 10091
## [265] 10093 10094 10097 10099 10104 10105 10108 10117 10118 10121 10125 10132
## [277] 10133 10135 10136 10142 10143 10144 10145 10146 10147 10148 10149 10150
## [289] 10151 10155 10178 10179 10180 10181 10186 10187 10188 10189 10197 10199
## [301] 10201 10202 10208 10209 10210 10211 10212 10213 10214 10215 10216 10225
## [313] 10226 10227 10229 10236 10237 10238 10239 10240 10251 10252 10253 10254
## [325] 10255 10256 10261 10266 10267 10268 10269 10274 10275 10278 10284 10294
## [337] 10295 10296 10297 10299 10301 10302 10303 10304 10305 10306 10322 10323
## [349] 10336 10344 10353 10355 10358 10363 10365 10369 10372 10385 10388 10389
## [361] 10391 10398 10402 10406 10416 10421 10429 10430 10431 10433 10435 10445
## [373] 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457
## [385] 10465 10466 10468 10469 10470 10471 10478 10479 10480 10481 10484 10488
## [397] 10489 10493 10496 10501 10502 10503 10515 10516 10517 10518 10519 10520
## [409] 10521 10522 10523 10527 10528 10529 10530 10531 10532 10540 10541 10551
## [421] 10552 10553 10554 10556 10572 10574 10575 10587 10591 10595 10596 10597
## [433] 10598 10599 10600 10603 10604 10606 10607 10630 10631 10632 10633 10634
## [445] 10635 10636 10640 10641 10643 10650 10651 10653 10662 10663 10664 10668
## [457] 10669 10672 10673 10674 10675 10690 10691 10694 10695 10696 10697 10698
## [469] 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710
## [481] 10711 10712 10713 10714 10715 10716 10717 10719 10720 10721 10722 10723
## [493] 10724 10725 10726 10727 10728 10729 10730 10731 10733 10734 10736 10737
## [505] 10743 10758 10759 10760 10761 10762 10763 10768 10773 10780 10781 10794
## [517] 10795 10796 10797 10798 10799 10800 10801 10803 10819 10820 10821 10822
## [529] 10824 10825 10827 10828 10831 10838 10839 10840 10841 10842 10845 10846
## [541] 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10860
## [553] 10870 10871 10872 10873 10875 10877 10878 10879 10880 10881 10882 10883
## [565] 10894 10895 10905 10908 10926 10929 10932 10933 10937 10938 10939 10940
## [577] 10941 10942 10943 10950 10957 10967 10968 10969 10970 10971 10972 10979
## [589] 10991 11002 11010 11013 11014 11017 11019 11021 11022 11023 11026 11027
## [601] 11028 11029 11030 11031 11032 11034 11036 11038 11039 11040 11041 11044
## [613] 11051 11053 11054 11055 11056 11057 11058 11059 11062 11063 11064 11066
## [625] 11067 11071 11073 11083 11086 11087 11088 11089 11090 11091 11096 11108
## [637] 11109 11110 11111 11117 11118 11119 11120 11127 11132 11141 11143 11146
## [649] 11148 11149 11150 11151 11155 11163 11165 11182 11184 11185 11186 11187
## [661] 11188 11190 11191 11192 11193 11196 11197 11203 11205 11206 11208 11209
## [673] 11210 11211 11212 11213 11215 11216 11217 11218 11220 11225 11229 11232
## [685] 11236 11237 11240 11241 11242 11243 11244 11245 11247 11250 11253 11254
## [697] 11260 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277
## [709] 11280 11290 11291 11294 11295 11301 11302 11303 11304 11305 11306 11307
## [721] 11308 11314 11315 11316 11317 11318 11319 11320 11323 11324 11325 11337
## [733] 11346 11347 11348 11351 11352 11367 11368 11369 11375 11377 11378 11379
## [745] 11380 11381 11382 11383 11385 11386 11388 11389 11394 11395 11401 11405
## [757] 11406 11412 11419 11420 11421 11422 11423 11424 11425 11428 11429 11432
## [769] 11444 11445 11446 11452 11453 11456 11472 11474 11475 11476 11477 11479
## [781] 11480 11481 11484 11494 11495 11501 11502 11503 11504 11514 11516 11517
## [793] 11518 11519 11522 11526 11527 11528 11529 11530 11531 11532 11533 11534
## [805] 11535 11540 11549 11554 11555 11556 11557 11558 11559 11560 11561 11562
## [817] 11563 11564 11565 11566 11569 11570 11571 11572 11579 11580 11581 11582
## [829] 11583 11590 11591 11592 11593 11594 11599 11600 11601 11602 11603 11606
## [841] 11613 11614 11615 11616 11617 11618 11619 11620 11623 11624 11625 11631
## [853] 11632 11633 11634 11635 11636 11637 11639 11640 11641 11642 11644 11646
## [865] 11647 11648 11649 11650 11654 11655 11656 11657 11658 11662 11663 11672
## [877] 11673 11674 11675 11676 11680 11681 11682 11683 11685 11686 11688 11689
## [889] 11690 11693 11695 11703 11704 11705 11706 11711 11712 11713 11714 11715
## [901] 11716 11717 11718 11722 11724 11725 11726 11727 11728 11729 11734 11735
## [913] 11736 11742 11745 11746 11752 11753 11754 11755 11765 11772 11774 11775
## [925] 11776 11777 11783 11784 11785 11786 11787 11792 11794 11795 11796 11797
## [937] 11798 11799 11800 11808 11809 11810 11811 11812 11815 11817 11818 11819
## [949] 11821 11822 11823 11824 11825 11826 11827 11836 11837 11838 11841 11844
## [961] 11845 11846 11854 11855 11856 11857 11867 11868 11874 11875 11879 11882
## [973] 11885 11888 11892 11893 11894 11899 11900 11901 11902 11905 11906 11907
## [985] 11909 11910 11911 11912 11924 11926 11927 11928 11930 11931 11932 11933
## [997] 11934 11935 11936 11945 11946 11947 11948 11949 11950 11951 11956 11957
## [1009] 11963 11964 11965 11966 11967 11970 11971 11972 11973 11982 11983 11984
## [1021] 11985 11987 11991 11992 11993 11994 11995 11996 11997 12001 12002 12003
## [1033] 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015
## [1045] 12018 12019 12022 12023 12032 12033 12034 12035 12036 12037 12038 12039
## [1057] 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050
plot_missing(merged)
p=function(x) {sum(is.na(x))/length(x)*100}
apply(merged,2,p)
## Brewery_id Name.x Beer_ID ABV IBU Style Ounces
## 0.000000 0.000000 0.000000 2.572614 41.701245 0.000000 0.000000
## Name.y City State
## 0.000000 0.000000 0.000000
md.pattern(merged)
## Brewery_id Name.x Beer_ID Style Ounces Name.y City State ABV IBU
## 1405 1 1 1 1 1 1 1 1 1 1 0
## 943 1 1 1 1 1 1 1 1 1 0 1
## 62 1 1 1 1 1 1 1 1 0 0 2
## 0 0 0 0 0 0 0 0 62 1005 1067
md.pairs(merged)
## $rr
## Brewery_id Name.x Beer_ID ABV IBU Style Ounces Name.y City State
## Brewery_id 2410 2410 2410 2348 1405 2410 2410 2410 2410 2410
## Name.x 2410 2410 2410 2348 1405 2410 2410 2410 2410 2410
## Beer_ID 2410 2410 2410 2348 1405 2410 2410 2410 2410 2410
## ABV 2348 2348 2348 2348 1405 2348 2348 2348 2348 2348
## IBU 1405 1405 1405 1405 1405 1405 1405 1405 1405 1405
## Style 2410 2410 2410 2348 1405 2410 2410 2410 2410 2410
## Ounces 2410 2410 2410 2348 1405 2410 2410 2410 2410 2410
## Name.y 2410 2410 2410 2348 1405 2410 2410 2410 2410 2410
## City 2410 2410 2410 2348 1405 2410 2410 2410 2410 2410
## State 2410 2410 2410 2348 1405 2410 2410 2410 2410 2410
##
## $rm
## Brewery_id Name.x Beer_ID ABV IBU Style Ounces Name.y City State
## Brewery_id 0 0 0 62 1005 0 0 0 0 0
## Name.x 0 0 0 62 1005 0 0 0 0 0
## Beer_ID 0 0 0 62 1005 0 0 0 0 0
## ABV 0 0 0 0 943 0 0 0 0 0
## IBU 0 0 0 0 0 0 0 0 0 0
## Style 0 0 0 62 1005 0 0 0 0 0
## Ounces 0 0 0 62 1005 0 0 0 0 0
## Name.y 0 0 0 62 1005 0 0 0 0 0
## City 0 0 0 62 1005 0 0 0 0 0
## State 0 0 0 62 1005 0 0 0 0 0
##
## $mr
## Brewery_id Name.x Beer_ID ABV IBU Style Ounces Name.y City State
## Brewery_id 0 0 0 0 0 0 0 0 0 0
## Name.x 0 0 0 0 0 0 0 0 0 0
## Beer_ID 0 0 0 0 0 0 0 0 0 0
## ABV 62 62 62 0 0 62 62 62 62 62
## IBU 1005 1005 1005 943 0 1005 1005 1005 1005 1005
## Style 0 0 0 0 0 0 0 0 0 0
## Ounces 0 0 0 0 0 0 0 0 0 0
## Name.y 0 0 0 0 0 0 0 0 0 0
## City 0 0 0 0 0 0 0 0 0 0
## State 0 0 0 0 0 0 0 0 0 0
##
## $mm
## Brewery_id Name.x Beer_ID ABV IBU Style Ounces Name.y City State
## Brewery_id 0 0 0 0 0 0 0 0 0 0
## Name.x 0 0 0 0 0 0 0 0 0 0
## Beer_ID 0 0 0 0 0 0 0 0 0 0
## ABV 0 0 0 62 62 0 0 0 0 0
## IBU 0 0 0 62 1005 0 0 0 0 0
## Style 0 0 0 0 0 0 0 0 0 0
## Ounces 0 0 0 0 0 0 0 0 0 0
## Name.y 0 0 0 0 0 0 0 0 0 0
## City 0 0 0 0 0 0 0 0 0 0
## State 0 0 0 0 0 0 0 0 0 0
ABV and IBU columns contain missing values. The ABV column contains 62 missing values, and the IBU column contains 1005 missing values. Since ABV contains fewer missing values, we can use medians at the state level to replace those values. IBU contains many missing values. So, we use a predictive regression model to predict the IBU value based on ABV.
Now, we will replace the missing data with the median
for(i in 1:ncol(merged))
{
if(is.numeric(merged[,i]))
{
merged[is.na(merged[,i]), i] <- median(merged[,i], na.rm = TRUE)
}
}
Missing map after replacing of the data
plot_missing(merged)
ABV.med = aggregate(ABV ~ State, data = merged, FUN = median)
IBU.med = aggregate(IBU ~ State, data = merged, FUN = median)
ABV.med
## State ABV
## 1 AK 0.0560
## 2 AL 0.0600
## 3 AR 0.0520
## 4 AZ 0.0560
## 5 CA 0.0580
## 6 CO 0.0590
## 7 CT 0.0600
## 8 DC 0.0625
## 9 DE 0.0555
## 10 FL 0.0560
## 11 GA 0.0550
## 12 HI 0.0540
## 13 IA 0.0555
## 14 ID 0.0565
## 15 IL 0.0580
## 16 IN 0.0580
## 17 KS 0.0500
## 18 KY 0.0600
## 19 LA 0.0520
## 20 MA 0.0540
## 21 MD 0.0580
## 22 ME 0.0510
## 23 MI 0.0600
## 24 MN 0.0560
## 25 MO 0.0550
## 26 MS 0.0580
## 27 MT 0.0550
## 28 NC 0.0570
## 29 ND 0.0500
## 30 NE 0.0560
## 31 NH 0.0550
## 32 NJ 0.0460
## 33 NM 0.0610
## 34 NV 0.0580
## 35 NY 0.0550
## 36 OH 0.0580
## 37 OK 0.0600
## 38 OR 0.0560
## 39 PA 0.0560
## 40 RI 0.0550
## 41 SC 0.0550
## 42 SD 0.0600
## 43 TN 0.0570
## 44 TX 0.0560
## 45 UT 0.0400
## 46 VA 0.0565
## 47 VT 0.0550
## 48 WA 0.0555
## 49 WI 0.0520
## 50 WV 0.0620
## 51 WY 0.0500
IBU.med
## State IBU
## 1 AK 35.0
## 2 AL 39.5
## 3 AR 35.0
## 4 AZ 35.0
## 5 CA 35.0
## 6 CO 35.0
## 7 CT 35.0
## 8 DC 35.0
## 9 DE 43.5
## 10 FL 35.0
## 11 GA 35.0
## 12 HI 35.0
## 13 IA 29.5
## 14 ID 35.0
## 15 IL 35.0
## 16 IN 35.0
## 17 KS 22.0
## 18 KY 35.0
## 19 LA 35.0
## 20 MA 35.0
## 21 MD 35.0
## 22 ME 35.0
## 23 MI 35.0
## 24 MN 38.0
## 25 MO 32.5
## 26 MS 45.0
## 27 MT 35.0
## 28 NC 35.0
## 29 ND 32.0
## 30 NE 35.0
## 31 NH 35.0
## 32 NJ 34.5
## 33 NM 35.0
## 34 NV 35.0
## 35 NY 35.0
## 36 OH 35.0
## 37 OK 35.0
## 38 OR 35.0
## 39 PA 35.0
## 40 RI 32.0
## 41 SC 35.0
## 42 SD 35.0
## 43 TN 36.0
## 44 TX 35.0
## 45 UT 35.0
## 46 VA 37.5
## 47 VT 35.0
## 48 WA 35.0
## 49 WI 35.0
## 50 WV 57.5
## 51 WY 32.0
We are going to show ABV and IBU in different states in bar plots:
P.ABV = ggplot(ABV.med, aes(x = ABV, y = State)) +
geom_bar(stat = "identity", width = 0.5, color = "blue", fill = "skyblue") +
labs(title = "Median ABV") +
theme_bw() +
theme(text = element_text(size = 8.1))
P.IBU = ggplot(IBU.med, aes(x = IBU, y = State)) +
geom_bar(stat = "identity", width = 0.5, color = "blue", fill = "skyblue") +
labs(title = "Median IBU") +
theme_bw()+
theme(text = element_text(size = 8.1))
grid.arrange(P.ABV, P.IBU,ncol = 2)
df2=subset(merged,select=c("State","ABV","IBU"))
maxABV=subset(df2,ABV==max(ABV,na.rm = TRUE))
maxABV
## State ABV IBU
## 375 CO 0.128 35
maxIBU=subset(df2,IBU==max(IBU,na.rm = TRUE))
maxIBU
## State ABV IBU
## 1857 OR 0.082 138
ABV.max = aggregate(ABV ~ State, data = merged, FUN = max)
ABV.max[order(-ABV.max$ABV),][1,]
## State ABV
## 6 CO 0.128
IBU.max = aggregate(IBU ~ State, data = merged, FUN = max)
IBU.max[order(-IBU.max$IBU),][1,]
## State IBU
## 38 OR 138
abv = ABV.max[order(-ABV.max$ABV),][1:5,]
ggplot(abv, aes(x = ABV*100, y = reorder(State,ABV))) +
geom_bar(stat = "identity", width = 0.5, color = "orange", fill = "gold") +
labs(title = "Top 5 States with maximum ABV values", y = "State", x = "ABV percentage") +
theme_bw()+
geom_text(aes(label = paste0(ABV*100,"%"))) +
theme(text = element_text(size = 10))
ibu = IBU.max[order(-IBU.max$IBU),][1:5,]
ggplot(ibu, aes(x = IBU, y = reorder(State,IBU))) +
geom_bar(stat = "identity", width = 0.5, color = "orange", fill = "gold") +
labs(title = "Top 5 States with maximum IBU values", y ="State") +
theme_bw()+
geom_text(aes(label = IBU)) +
theme(text = element_text(size = 10))
Colorado has the Maximum ABV at 0.128. Oregon has the Maximum IBU at 138.
summary(merged$ABV)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00100 0.05000 0.05600 0.05968 0.06700 0.12800
ggplot(merged, aes(x = ABV)) +
geom_histogram(color="blue", fill="skyblue") +
theme_bw() +
ggtitle("Histogram for ABV")
Null hypothesis: the ABV is normally distributed?
shapiro.test(merged$ABV)
##
## Shapiro-Wilk normality test
##
## data: merged$ABV
## W = 0.93421, p-value < 2.2e-16
ABV had not a normal distribution
ggplot(df2, aes(x = ABV, y = IBU)) +
geom_point(color = "blue")+
geom_smooth(method = "lm",se = F, color = "red")+
ggtitle("Scatterplot for ABV vs IBU") +
theme_bw()
ggplot(df2, aes(ABV,IBU,color="blue")) +
geom_point(color="blue")+
geom_smooth()
chart.Correlation(df2[,-1],histogram=TRUE,pch=100)
ABV and IBU was correlated to each other
Budweiser would also like to investigate the difference with respect to IBU and ABV between IPAs (India Pale Ales) and other types of Ale (any beer with “Ale” in its name other than IPA). We decide to use KNN classification to investigate this relationship. We will provide statistical evidence one way or the other.
minDF = subset(merged, select = c("ABV","IBU","Style","State"),
is.na(ABV) == FALSE & is.na(IBU) == FALSE)
minDF$S1 = NA
for(k in 1:nrow(minDF))
{
if(grepl("IPA", minDF$Style[k], fixed = TRUE) == TRUE)
{
minDF$S1[k] = "IPA"
}
else if(grepl("Ale", minDF$Style[k], fixed = TRUE) == TRUE)
{
minDF$S1[k] = "Ale"
}
}
clsdf = subset(minDF, select = c("ABV","IBU","S1","State"), is.na(S1) == FALSE)
clsdf = dplyr::rename(clsdf,Style = S1)
The relationship of ABV and IBU in terms of Ale and IPA
ggplot(clsdf,aes(ABV,IBU,color=Style))+
geom_point(shape=4,size=2)+
geom_smooth(method=loess,se=F)
Box plot of ABV in Ale and IPA
ggplot(clsdf,aes(Style,ABV,fill=Style))+geom_boxplot()
Box plot of IBU in Ale and IPA
ggplot(clsdf,aes(Style,IBU,fill=Style))+geom_boxplot()
The relationship between IBU and ABV in terms of Style(Ale,IPA)
ggplot(clsdf,aes(ABV,IBU,color=Style))+
geom_point(shape=4,size=2)+
geom_smooth(method=lm,se=F)
to Compare Two Variances
var.test(IBU~Style,data=clsdf)
##
## F test to compare two variances
##
## data: IBU by Style
## F = 0.33279, num df = 962, denom df = 570, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.2870047 0.3848152
## sample estimates:
## ratio of variances
## 0.3327944
Two variance were different
ale=subset(clsdf,Style=="Ale")
ipa=subset(clsdf,Style=="IPA")
To test normality of the variables
shapiro.test(ale$IBU)
##
## Shapiro-Wilk normality test
##
## data: ale$IBU
## W = 0.85296, p-value < 2.2e-16
shapiro.test(ipa$IBU)
##
## Shapiro-Wilk normality test
##
## data: ipa$IBU
## W = 0.89739, p-value < 2.2e-16
shapiro.test(ale$ABV)
##
## Shapiro-Wilk normality test
##
## data: ale$ABV
## W = 0.90472, p-value < 2.2e-16
shapiro.test(ipa$ABV)
##
## Shapiro-Wilk normality test
##
## data: ipa$ABV
## W = 0.97267, p-value = 7.813e-09
The variables had not normal distribution
To plot the normality of the variables
qqnorm(ale$IBU)
qqnorm(ale$ABV)
qqnorm(ipa$IBU)
qqnorm(ipa$ABV)
To test the different between ABV in diffrenet styles(IPA and Ale), we should use Mann withney test
wilcox.test(clsdf$ABV ~ clsdf$Style)
##
## Wilcoxon rank sum test with continuity correction
##
## data: clsdf$ABV by clsdf$Style
## W = 116772, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
ABV and Style had a significant difference
wilcox.test(clsdf$IBU ~ clsdf$Style)
##
## Wilcoxon rank sum test with continuity correction
##
## data: clsdf$IBU by clsdf$Style
## W = 97264, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
IBU and Style had a significant difference
Using KNN:
clsdf = subset(clsdf, select = c("ABV","IBU","Style"))
idx = sample.int(n = nrow(clsdf), size = floor(0.7*nrow(clsdf)), replace = F)
train = clsdf[idx,]
test = clsdf[-idx,]
trn_target = train$Style
trn = train[,-3]
tst_target = test$Style
tst = test[,-3]
pred = knn(train = trn, test = tst, cl = trn_target, k = 6)
head(pred)
## [1] Ale Ale IPA Ale IPA IPA
## Levels: Ale IPA
To show confusion matrix:
model_table=table(tst_target,pred)
model_table
## pred
## tst_target Ale IPA
## Ale 234 38
## IPA 52 137
To test the accuracy:
sum(diag(model_table))/nrow(tst)
## [1] 0.8047722
Accuracy = NULL
mis = NULL
sen = NULL
spe = NULL
for(i in 1:50)
{
pred = knn(train = trn, test = tst, cl = trn_target, k = i)
head(pred)
model_table=table(trn_target)
tab = table(Predicted = pred, Real = tst_target)
Accuracy[i] = ((tab[1,1] + tab[2,2])/sum(tab))*100
mis[i] = round((tab[1,2]+tab[2,1])/sum(tab),2)
sen[i] = round(tab[2,2]/(tab[2,2]+tab[1,2]),2)
spe[i] = round(tab[1,1]/(tab[1,1]+tab[2,1]),2)
}
plot(x = c(1:50), y = Accuracy, xlab = "k", pch = 19, type = "b")
abline(v = which.max(Accuracy), col = "red", lwd = 2)
data.frame(Measure = c("Accuracy","Misclassification Rate","Sensitivity","Specificity"),
Value = c(round(Accuracy[6],2),round(mis[6],2),round(sen[6],2),round(spe[6],2)))
## Measure Value
## 1 Accuracy 80.69
## 2 Misclassification Rate 0.19
## 3 Sensitivity 0.71
## 4 Specificity 0.87
9.Find one other useful inference from the data that you feel Budweiser may be able to find value in.
oz = aggregate(Ounces ~ State, data = merged, FUN = sum)
ggplot(oz, aes(x = Ounces, y = reorder(State,Ounces))) +
geom_bar(stat = "identity", width = 0.5, color = "blue", fill = "skyblue") +
labs(title = "Median Ounces per State", y = " Ounces") +
theme_bw() +
theme(text = element_text(size = 8.1))
co = subset(merged, State == " CO")
oz_co = aggregate(Ounces ~ City, data = co, FUN = sum)
ggplot(oz_co, aes(x = Ounces, y = reorder(City,Ounces))) +
geom_bar(stat = "identity", width = 0.5, color = "blue", fill = "skyblue") +
labs(title = "Number of Ounces for Cities in Colorado", y = " Ounces") +
theme_bw() +
theme(text = element_text(size = 8.1))
To evaluate the relationship between Ounces in different cities of Colorado
Anova_result= aov(Ounces ~ City, data = co)
summary(Anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## City 26 415.3 15.972 3.318 5.61e-07 ***
## Residuals 238 1145.5 4.813
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
minDF1 = subset(co, select = c("Ounces","Style"),is.na(Ounces) == FALSE)
minDF1$S1 = NA
for(k in 1:nrow(minDF1))
{
if(grepl("IPA", minDF1$Style[k], fixed = TRUE) == TRUE)
{
minDF1$S1[k] = "IPA"
}
else if(grepl("Ale", minDF1$Style[k], fixed = TRUE) == TRUE)
{
minDF1$S1[k] = "Ale"
}
}
aggregate(Ounces ~ S1, data = minDF1, FUN = sum)
## S1 Ounces
## 1 Ale 1600.0
## 2 IPA 770.4
Test Normality of Ounces
shapiro.test(minDF1$Ounces)
##
## Shapiro-Wilk normality test
##
## data: minDF1$Ounces
## W = 0.6026, p-value < 2.2e-16
qqnorm(minDF1$Ounces)
Evaluation of the Ounces between two styles(IPA and Ale) in different cities in Colorado
wilcox.test(minDF1$Ounces ~ minDF1$S1)
##
## Wilcoxon rank sum test with continuity correction
##
## data: minDF1$Ounces by minDF1$S1
## W = 3196, p-value = 0.3247
## alternative hypothesis: true location shift is not equal to 0
The difference between Ounces and styles wasnot significant